Ce notebook Jupyter ne contient que les codes et sorties python que nous avons mis en place pour notre travail dans le cadre du défi IA 2023.
La présentation et l'analyse détaillée de notre travail est contenue dans le rapport pdf nommé "BERNADA_ROBERT_rapport.pdf"
import numpy as np
from numpy import arange
import math
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import pandas as pd
from geopy.geocoders import Nominatim
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.linear_model import RidgeCV, LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import LabelEncoder
# --- Data loading -----------------------------------------------------------
# Our scraped datasets.
train = pd.read_csv('all_data.csv')    # every scraped row
train1 = pd.read_csv('best_data.csv')  # subset that gave better results
# Hotel metadata, indexed so it can be joined on (hotel_id, city).
hotel_features = pd.read_csv('features_hotels.csv',index_col=['hotel_id', 'city'])
# Work on copies so the raw frames stay untouched.
# (The original concatenated each frame with an empty DataFrame, which is
# equivalent to a plain copy.)
df_train1 = train.copy()   # all scraped data
df_train = train1.copy()   # best-performing subset
# Test set the submissions are based on.
df_pred = pd.read_csv('test_set.csv')
# Enrich every dataset with the hotel features.
df_train = df_train.join(hotel_features, on=['hotel_id', 'city'])
df_train1 = df_train1.join(hotel_features, on=['hotel_id', 'city'])
df_pred = df_pred.join(hotel_features, on=['hotel_id', 'city'])
Le jeu de données complet :
display(df_train1)
| Unnamed: 0 | hotel_id | price | stock | city | date | language | mobile | avatar_id | group | brand | parking | pool | children_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 283 | 98 | 104 | amsterdam | 43 | austrian | 0 | 59750 | Independant | Independant | 1 | 0 | 0 |
| 1 | 1 | 387 | 241 | 264 | amsterdam | 43 | austrian | 0 | 59750 | Accar Hotels | Safitel | 1 | 0 | 0 |
| 2 | 2 | 806 | 237 | 261 | amsterdam | 43 | austrian | 0 | 59750 | Chillton Worldwide | Chill Garden Inn | 0 | 0 | 1 |
| 3 | 3 | 465 | 237 | 264 | amsterdam | 43 | austrian | 0 | 59750 | Morriott International | Corlton | 1 | 1 | 0 |
| 4 | 4 | 724 | 235 | 263 | amsterdam | 43 | austrian | 0 | 59750 | Morriott International | Corlton | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 977519 | 82 | 502 | 102 | 13 | vilnius | 16 | swedish | 1 | 325723 | Boss Western | Boss Western | 0 | 0 | 0 |
| 977520 | 83 | 593 | 59 | 24 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 0 | 0 | 0 |
| 977521 | 84 | 294 | 62 | 3 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 1 | 0 | 0 |
| 977522 | 85 | 355 | 87 | 10 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 1 | 0 | 0 |
| 977523 | 86 | 551 | 62 | 12 | vilnius | 16 | swedish | 1 | 325723 | Yin Yang | 8 Premium | 1 | 0 | 0 |
977524 rows × 14 columns
Jeu de données df_train ne comprenant que nos dernières requêtes, car il permet d'obtenir de meilleurs résultats :
display(df_train)
| Unnamed: 0 | hotel_id | price | stock | city | date | language | mobile | avatar_id | group | brand | parking | pool | children_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 591 | 332 | 9 | amsterdam | 5 | austrian | 0 | 199603 | Accar Hotels | Safitel | 0 | 0 | 0 |
| 1 | 1 | 256 | 82 | 0 | amsterdam | 5 | austrian | 0 | 199603 | Accar Hotels | Ibas | 1 | 0 | 0 |
| 2 | 2 | 111 | 154 | 4 | amsterdam | 5 | austrian | 0 | 199603 | Boss Western | Boss Western | 1 | 0 | 0 |
| 3 | 3 | 867 | 156 | 0 | amsterdam | 5 | austrian | 0 | 199603 | Independant | Independant | 1 | 0 | 0 |
| 4 | 4 | 12 | 83 | 0 | amsterdam | 5 | austrian | 0 | 199603 | Independant | Independant | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 612182 | 82 | 502 | 102 | 13 | vilnius | 16 | swedish | 1 | 325723 | Boss Western | Boss Western | 0 | 0 | 0 |
| 612183 | 83 | 593 | 59 | 24 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 0 | 0 | 0 |
| 612184 | 84 | 294 | 62 | 3 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 1 | 0 | 0 |
| 612185 | 85 | 355 | 87 | 10 | vilnius | 16 | swedish | 1 | 325723 | Independant | Independant | 1 | 0 | 0 |
| 612186 | 86 | 551 | 62 | 12 | vilnius | 16 | swedish | 1 | 325723 | Yin Yang | 8 Premium | 1 | 0 | 0 |
612187 rows × 14 columns
Notre jeu de données complet compte 977524 lignes et 14 colonnes (avant transformation). Le jeu de données dont nous allons nous servir dans la suite contient quant à lui 612187 lignes et 14 colonnes (avant transformation).
df_pred.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6644 entries, 0 to 6643 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 6644 non-null int64 1 order_requests 6644 non-null int64 2 city 6644 non-null object 3 date 6644 non-null int64 4 language 6644 non-null object 5 mobile 6644 non-null int64 6 avatar_id 6644 non-null int64 7 hotel_id 6644 non-null int64 8 stock 6644 non-null int64 9 group 6644 non-null object 10 brand 6644 non-null object 11 parking 6644 non-null int64 12 pool 6644 non-null int64 13 children_policy 6644 non-null int64 dtypes: int64(10), object(4) memory usage: 726.8+ KB
# Automated exploratory profiling report of the test set.
pandas_profiling.ProfileReport(df_pred)
# Distribution of the requested booking dates.
df_pred['date'].value_counts()
2 484 1 467 0 464 5 458 6 450 3 449 4 443 16 281 18 279 21 268 15 267 35 264 17 264 20 252 38 250 37 241 36 237 19 235 34 222 39 189 40 180 Name: date, dtype: int64
# Histogram of the booking dates in the test set.
plt.figure(figsize=(20,10))
plt.hist(df_pred['date'] ,40, density=True)
plt.title("Histogramme des dates du jeu de données test", fontsize = 30, color= "blue")
# Bug fix: the original referenced `plt.show` without calling it (a no-op)
# and set the title afterwards; the title is now set before rendering.
plt.show()
Text(0.5, 1.0, 'Histogramme des dates du jeu de données test')
Le jeu de données test ne contient pas toutes les dates possibles. Il y a beaucoup de recherches pour des dates de 0 à 6 : il y en a 3215, ce qui représente près de la moitié du jeu de données test.
df_pred['stock'].describe()
count 6644.000000 mean 23.766105 std 32.582240 min 0.000000 25% 2.000000 50% 8.000000 75% 33.000000 max 199.000000 Name: stock, dtype: float64
# Histogram of the stock values in the test set.
plt.figure(figsize=(20,10))
plt.hist(df_pred['stock'] ,100, density=True)
plt.title("Histogramme des stocks du jeu de données test", fontsize = 30, color= "blue")
# Bug fix: `plt.show` was referenced without being called; the title is
# now set before rendering.
plt.show()
Text(0.5, 1.0, 'Histogramme des stocks du jeu de données test')
df_pred['language'].describe()
count 6644 unique 27 top hungarian freq 967 Name: language, dtype: object
df_pred['language'].value_counts()
hungarian 967 finnish 466 austrian 461 romanian 439 slovakian 418 swedish 404 estonian 390 bulgarian 356 danish 346 irish 209 maltese 197 greek 181 slovene 174 belgian 153 spanish 141 dutch 134 polish 131 french 128 italian 128 luxembourgish 125 lithuanian 122 latvian 113 cypriot 106 czech 104 croatian 101 german 80 portuguese 70 Name: language, dtype: int64
Recherches en hongrois largement majoritaires (967). \ Très peu de recherches en portugais (70).
df_pred['city'].describe()
count 6644 unique 9 top amsterdam freq 1134 Name: city, dtype: object
df_pred['city'].value_counts()
amsterdam 1134 paris 1125 madrid 1094 copenhagen 723 rome 721 vilnius 609 vienna 568 sofia 374 valletta 296 Name: city, dtype: int64
Amsterdam, Paris et Madrid les villes les plus présentes dans le jeu de données test.
df_pred['mobile'].value_counts()
0 3630 1 3014 Name: mobile, dtype: int64
# Bar chart of desktop (0) vs mobile (1) searches in the test set.
# The heights are now derived from df_pred instead of being hard-coded,
# so the chart cannot drift out of sync with the data.
counts = df_pred['mobile'].value_counts().sort_index()
bars = tuple(str(v) for v in counts.index)
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Recherche sur téléphone ou ordinateur", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Recherche sur téléphone ou ordinateur')
Il y a un peu plus de recherche sur ordinateur (0) que sur téléphone (1).
df_pred['parking'].value_counts()
1 3523 0 3121 Name: parking, dtype: int64
# Bar chart of hotels with / without parking in the test set.
# Heights derived from the data instead of hard-coded.
counts = df_pred['parking'].value_counts().sort_index()
bars = ('0 (sans parking)', '1 (avec parking)')
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Hôtels avec ou sans parking", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Hôtels avec ou sans parking')
df_pred['pool'].value_counts()
0 5452 1 1192 Name: pool, dtype: int64
# Bar chart of hotels with / without a pool in the test set.
# Heights derived from the data instead of hard-coded.
counts = df_pred['pool'].value_counts().sort_index()
bars = ('0 (sans piscine)', '1 (avec piscine)')
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Hôtels avec ou sans piscine", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Hôtels avec ou sans piscine')
df_pred['children_policy'].value_counts()
0 5961 1 572 2 111 Name: children_policy, dtype: int64
# Bar chart of the child policies in the test set.
# Heights derived from the data instead of hard-coded.
counts = df_pred['children_policy'].value_counts().sort_index()
bars = ('0 ', '1' , '2 ')
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Hôtels avec ou sans restrictions pour les enfants", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Hôtels avec ou sans restrictions pour les enfants')
# Correlation heat map of the test set.
# NOTE(review): recent pandas (>= 2.0) raises on non-numeric columns here;
# pass numeric_only=True if the environment is upgraded.
corr = df_pred.corr()
px.imshow(corr,text_auto=True)
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 612187 entries, 0 to 612186 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 612187 non-null int64 1 hotel_id 612187 non-null int64 2 price 612187 non-null int64 3 stock 612187 non-null int64 4 city 612187 non-null object 5 date 612187 non-null int64 6 language 612187 non-null object 7 mobile 612187 non-null int64 8 avatar_id 612187 non-null int64 9 group 612187 non-null object 10 brand 612187 non-null object 11 parking 612187 non-null int64 12 pool 612187 non-null int64 13 children_policy 612187 non-null int64 dtypes: int64(10), object(4) memory usage: 65.4+ MB
pandas_profiling.ProfileReport(df_train)
# Histogram of the target variable (price) in the training set.
plt.figure(figsize=(20,10))
plt.hist(df_train['price'],100, density=True)
plt.title("Histogramme du prix", fontsize = 30, color= "blue")
# Bug fix: `plt.show` was referenced without being called; the title is
# now set before rendering.
plt.show()
Text(0.5, 1.0, 'Histogramme du prix')
# Box plot of the prices over the whole training set.
fig = px.box(df_train, y="price")
fig.show()
# Price distribution per booking date.
fig = px.box(df_train, x="date", y="price")
fig.show()
# Price distribution per city.
fig = px.box(df_train, x="city", y="price")
fig.show()
# Mean price per city.
city_price = df_train.groupby('city')['price'].mean().reset_index()
# Geocoder used to resolve city names into coordinates.
geolocator = Nominatim(user_agent="myGeocoder")
# Base map centred on Europe.
europe_map = folium.Map(location=[54.5260, 15.2551], zoom_start=4)
# Hard-coded coordinates for Sofia: the geocoder otherwise resolves it
# to a place in Madagascar.
sofia_lat = 42.6977
sofia_lon = 23.3219
# Price range used to scale the bubble radii.
max_price = df_train['price'].max()
min_price = df_train['price'].min()
# One bubble per city, radius proportional to its mean price.
# The duplicated folium.Circle call of the original is factored out:
# only the coordinate lookup differs between the two branches.
for i in range(0, len(city_price)):
    city = city_price.iloc[i]['city']
    if city == "sofia":
        lat, lon = sofia_lat, sofia_lon
    else:
        location = geolocator.geocode(city)
        lat, lon = location.latitude, location.longitude
    folium.Circle(
        location=[lat, lon],
        popup=city,
        radius=(city_price.iloc[i]['price']-min_price)/(max_price-min_price)*10**6,
        color='crimson',
        fill=True,
        fill_color='crimson'
    ).add_to(europe_map)
# Render the map.
europe_map
# Price distribution per categorical feature, one box plot each.
for feature in ("language", "mobile", "parking", "pool", "children_policy"):
    fig = px.box(df_train, x=feature, y="price")
    fig.show()
df_train['mobile'].value_counts()
0 306149 1 306038 Name: mobile, dtype: int64
# Bar chart of desktop vs mobile searches in the training set.
# Heights derived from the data instead of hard-coded.
counts = df_train['mobile'].value_counts().sort_index()
bars = tuple(str(v) for v in counts.index)
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Recherche sur téléphone ou ordinateur", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Recherche sur téléphone ou ordinateur')
df_train['pool'].value_counts()
0 508819 1 103368 Name: pool, dtype: int64
# Bar chart of hotels with / without a pool in the training set.
# Heights derived from the data instead of hard-coded.
counts = df_train['pool'].value_counts().sort_index()
bars = tuple(str(v) for v in counts.index)
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Hotels avec ou sans piscine", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Hotels avec ou sans piscine')
df_train['parking'].value_counts()
0 344585 1 267602 Name: parking, dtype: int64
# Bar chart of hotels with / without parking in the training set.
# Heights derived from the data instead of hard-coded.
counts = df_train['parking'].value_counts().sort_index()
bars = tuple(str(v) for v in counts.index)
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Hotels avec ou sans parking", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Hotels avec ou sans parking')
df_train['children_policy'].value_counts()
0 582186 1 20860 2 9141 Name: children_policy, dtype: int64
# Bar chart of the child policies in the training set.
# Heights derived from the data instead of hard-coded.
counts = df_train['children_policy'].value_counts().sort_index()
bars = tuple(str(v) for v in counts.index)
height = counts.to_list()
y_pos = np.arange(len(bars))
plt.figure(figsize=(20,10))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars)
plt.title("Politique enfant des hotels", fontsize = 30, color= "blue")
Text(0.5, 1.0, 'Politique enfant des hotels')
# Correlation heat map of the training set.
# NOTE(review): recent pandas (>= 2.0) raises on non-numeric columns here;
# pass numeric_only=True if the environment is upgraded.
corr_train = df_train.corr()
px.imshow(corr_train,text_auto=True)
# Column layout of the dummy-encoded training matrix. X_pred is reindexed
# to this exact order inside prepare_data() so that the model sees the
# prediction features in the same positions as during training.
columnsTitles=['hotel_id', 'stock', 'date', 'mobile','parking', 'pool',
'children_policy', 'city_copenhagen', 'city_madrid', 'city_paris',
'city_rome', 'city_sofia', 'city_valletta', 'city_vienna',
'city_vilnius', 'language_belgian', 'language_bulgarian',
'language_croatian', 'language_cypriot', 'language_czech',
'language_danish', 'language_dutch', 'language_estonian',
'language_finnish', 'language_french', 'language_german',
'language_greek', 'language_hungarian', 'language_irish',
'language_italian', 'language_latvian', 'language_lithuanian',
'language_luxembourgish', 'language_maltese', 'language_polish',
'language_portuguese', 'language_romanian', 'language_slovakian',
'language_slovene', 'language_spanish', 'language_swedish',
'group_Boss Western', 'group_Chillton Worldwide', 'group_Independant',
'group_Morriott International', 'group_Yin Yang', 'brand_Ardisson',
'brand_Boss Western', 'brand_Chill Garden Inn', 'brand_Corlton',
'brand_CourtYord', 'brand_Ibas', 'brand_Independant',
'brand_J.Halliday Inn', 'brand_Marcure', 'brand_Morriot',
'brand_Navatel', 'brand_Quadrupletree', 'brand_Royal Lotus',
'brand_Safitel', 'brand_Tripletree']
def prepare_data(dataset, testset, columnsTitles, train_size, random_state=None):
    """Encode, split and align the train/test/prediction matrices.

    Parameters
    ----------
    dataset : training DataFrame; must contain 'price', 'avatar_id' and the
        technical 'Unnamed: 0' column.
    testset : DataFrame to predict; must contain 'index', 'order_requests'
        and 'avatar_id'.
    columnsTitles : column order X_pred is reindexed to, so it matches the
        dummy-encoded training matrix.
    train_size : fraction of `dataset` kept for training.
    random_state : optional seed forwarded to train_test_split so the split
        can be reproduced (default None keeps the original non-deterministic
        behaviour).

    Returns
    -------
    (train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred)
    """
    # One-hot encode the categorical columns; drop_first avoids collinearity.
    train = pd.get_dummies(dataset, drop_first=True)
    pred_df = pd.get_dummies(testset, drop_first=True)
    # Drop the technical index columns.
    train = train.drop(columns=['Unnamed: 0'])
    pred_df = pred_df.drop(columns=['index'])
    train_df, test_df = train_test_split(train, train_size=train_size,
                                         random_state=random_state)
    # The target ('price') and the avatar identifier are excluded from X.
    X_train = train_df.drop(['price', 'avatar_id'], axis=1)
    X_test = test_df.drop(['price', 'avatar_id'], axis=1)
    X_pred = pred_df.drop(['order_requests', 'avatar_id'], axis=1)
    # Align the prediction columns with the training layout.
    X_pred = X_pred.reindex(columns=columnsTitles)
    y_train = train_df['price']
    y_test = test_df['price']
    return (train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred)
Un premier essai en calculant le prix moyen pour chaque modalité des variables catégorielles.
# Hold out 20% of the data to evaluate this "group-mean" baseline.
train, test = train_test_split(df_train, train_size=0.8)
villes = ["amsterdam", "paris", "copenhagen", "madrid", "rome", "sofia", "valletta", "vienna", "vilnius"]
langues = ["austrian", "belgian", "bulgarian", "croatian", "cypriot", "czech", "danish", "dutch", "estonian",
           "finnish", "french", "german", "greek", "hungarian", "irish", "italian", "latvian", "lithuanian",
           "luxembourgish", "maltese", "polish", "portuguese", "romanian", "slovakian", "slovene", "spanish",
           "swedish"]
mobile = [0,1]
pool = [0,1]
parking = [0,1]
# Mean price per (city, mobile, language, pool) combination, stored as
# (mean_price, city, mobile_index, language, pool_index) tuples.
# Performance fix: each filter is now computed once at its own loop level
# instead of all four being recomputed in the innermost loop.
prix_moyen = []
for i in villes :
    data_ville = train[train["city"] == i ]
    for j in list(range(len(mobile))):
        data_ville_mobile = data_ville[data_ville["mobile"] == mobile[j]]
        for k in list(range(len(langues))):
            data_ville_mobile_langue = data_ville_mobile[data_ville_mobile["language"] == langues[k]]
            for h in list(range(len(pool))):
                data_ville_mobile_langue_piscine = data_ville_mobile_langue[data_ville_mobile_langue["pool"]==pool[h]]
                prix_moyen.append((np.mean(data_ville_mobile_langue_piscine['price']),i,j,langues[k],h))
def get_values(iterables, key_to_find):
    """Return every element of *iterables* that contains *key_to_find*."""
    return [item for item in iterables if key_to_find in item]
# Build the baseline submission: each test row gets the mean price of its
# (city, language, mobile, pool) group computed above.
rows = []
for i in list(range(len(df_pred))):
    # Public .at accessor instead of the private DataFrame._get_value.
    city_test = df_pred.at[i, 'city']
    language_test = df_pred.at[i, 'language']
    mobile_test = df_pred.at[i, 'mobile']
    pool_test = df_pred.at[i, 'pool']
    # Successively narrow the tuples down to the matching group.
    test1 = get_values(prix_moyen,city_test)
    test2 = get_values(test1, language_test)
    test3 = get_values(test2, mobile_test)
    test4 = get_values(test3, pool_test)
    test_i = df_pred.loc[[i]].copy()
    test_i['price'] = test4[0][0]
    rows.append(test_i)
# Performance fix: a single concat instead of the original quadratic
# concat-inside-the-loop.
df = pd.concat(rows, axis=0)
pred = df['price']
df_mean = pd.DataFrame(df_pred['index'], columns=['index'])
df_mean['price'] = pred
df_mean.to_csv("submission_mean.csv", index=False)
# Hold out 20% of the data for evaluation.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
# Baseline model: ordinary least-squares linear regression.
model0 = LinearRegression()
model0.fit(X_train, y_train)
coefficients = model0.coef_
print(coefficients)
[ 4.81939640e-03 -4.53476615e-01 -3.18601395e-02 -1.73928667e-01 -5.06287165e-01 8.28098629e+00 3.28764262e+00 2.53360061e+01 -2.68811237e+01 1.47082558e+01 -1.36069031e+01 -6.12714903e+01 -6.45012646e+00 3.06399307e-01 -4.02384103e+01 -6.71191543e+00 -1.05255778e+01 -6.85947420e+00 -5.57639153e-01 -7.15386926e+00 -1.41095989e+00 -5.87668128e+00 -1.11268361e+01 1.75392688e+00 -5.82044917e+00 -6.46482225e+00 -2.97080316e+00 -1.26443295e+01 -3.39456454e+00 -6.66368666e+00 -2.08659980e+00 -4.36080550e+00 -6.97392699e+00 -5.44847464e+00 -4.78152378e+00 -2.25993029e+00 -1.08201930e+01 -1.39080036e+01 -6.74589882e+00 -4.91177059e+00 2.30161056e+00 -2.66135493e+09 1.68197560e+08 -2.41800879e+09 9.79710074e+07 -4.90645802e+08 8.39632478e+01 2.17070918e+09 -6.58843130e+08 -5.88616583e+08 -5.88616727e+08 -4.90645805e+08 1.92736301e+09 2.17070921e+09 -4.90645674e+08 -5.88616681e+08 -4.90645762e+08 -6.58843228e+08 1.31229494e+02 -4.90645571e+08 -6.58843280e+08]
# RMSE of the linear regression on the held-out split.
y_pred_linear_regression = model0.predict(X_test)
print(math.sqrt(sum((y_test-y_pred_linear_regression)**2)/len(y_test)))
29.215286947905557
# Compare the first 30 predictions with the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
plt.plot(list(y_pred_linear_regression)[:30])
plt.plot(list(y_test)[:30])
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7f99d7ea9430>
# Predict on the submission set and write the linear-regression submission.
y_pred_reg_lin = model0.predict(X_pred)
prediction = y_pred_reg_lin.flatten()
df_linear_regression = pd.DataFrame(df_pred['index'], columns=['index'])
df_linear_regression['price'] = prediction
df_linear_regression.to_csv("submission_linear_regression.csv", index=False)
# Fresh 80/20 split for the Lasso experiment.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
# Lasso regression whose alpha is tuned by an exhaustive grid search.
model = Lasso()
# Cross-validation scheme: 10 folds, repeated 3 times.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Candidate alphas: 0.00, 0.01, ..., 0.99.
grid = {'alpha': arange(0, 1, 0.01)}
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# Run the search on the training split.
search.fit(X_train, y_train)
GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
estimator=Lasso(), n_jobs=-1,
param_grid={'alpha': array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
0.99])},
scoring='neg_mean_absolute_error')
# Retrieve the best alpha found by the grid search.
best_alpha = search.best_params_['alpha']
print(best_alpha)
search.best_params_
# Refit a Lasso model with the tuned alpha.
model = Lasso(alpha=best_alpha)
model.fit(X_train, y_train)
coefs = model.coef_
print(coefs)
0.05 [ 4.93930359e-03 -4.30684161e-01 -9.56088046e-02 -4.71172237e-02 -2.16748387e-01 9.46435613e+00 3.34323926e+00 2.58613508e+01 -2.57704469e+01 1.53087891e+01 -1.25198711e+01 -5.98565581e+01 -4.65996853e+00 5.03932517e-01 -3.90347671e+01 -1.43807913e-01 -3.80198574e+00 -5.01989649e-01 3.31628839e+00 -5.80504075e-01 2.70694132e+00 -0.00000000e+00 -4.37567483e+00 5.85177218e+00 -0.00000000e+00 -0.00000000e+00 9.10971402e-01 -5.93687502e+00 4.10462729e-01 -1.77130055e-01 1.63542082e+00 0.00000000e+00 -4.17818479e-01 -0.00000000e+00 0.00000000e+00 1.57570510e+00 -4.21458198e+00 -7.38332076e+00 -1.96721309e-01 0.00000000e+00 6.44809701e+00 -0.00000000e+00 6.05889435e+01 -4.70227652e+01 5.59356894e+01 -6.68226732e+01 7.96046090e+01 -2.13345572e+01 9.91474447e+01 9.84229862e+01 -4.24242395e+01 -7.17468576e+01 -0.00000000e+00 7.00701843e+00 5.63426969e+01 0.00000000e+00 -2.86135869e+01 9.44641208e-01 1.26583223e+02 1.57965925e+02 -4.62477552e+01]
# RMSE of the Lasso model on the held-out split.
y_pred_lasso = model.predict(X_test)
print(math.sqrt(sum((y_test-y_pred_lasso)**2)/len(y_test)))
29.14401726921807
# Compare the first 30 Lasso predictions with the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
plt.plot(list(y_pred_lasso)[:30])
plt.plot(list(y_test)[:30])
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x1de1edb9430>
# Predict on the submission set and write the Lasso submission.
y_pred_lasso = model.predict(X_pred)
prediction = y_pred_lasso.flatten()
df_lasso = pd.DataFrame(df_pred['index'], columns=['index'])
df_lasso['price'] = prediction
df_lasso.to_csv("submission_lasso.csv", index=False)
# Fresh 80/20 split for the ridge experiment.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
# Cross-validation method used to evaluate the model.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Ridge regression with built-in alpha selection over [0, 1).
model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring = None)
# Fit the model.
model.fit(X_train, y_train)
RidgeCV(alphas=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
0.99]),
cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=1))
# RMSE of the ridge model on the held-out split.
y_pred_ridge = model.predict(X_test)
print(math.sqrt(sum((y_test-y_pred_ridge)**2)/len(y_test)))
29.167572791091942
# Compare the first 30 ridge predictions with the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
plt.plot(list(y_pred_ridge)[:30])
plt.plot(list(y_test)[:30])
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7fb675462100>
# Predict on the submission set and write the ridge submission.
y_pred_ridge = model.predict(X_pred)
prediction = y_pred_ridge.flatten()
df_ridge = pd.DataFrame(df_pred['index'], columns=['index'])
df_ridge['price'] = prediction
df_ridge.to_csv("submission_ridge.csv", index=False)
def val_error_rate(X_test) :
    """Return (RMSE, raw predictions) of the current model on X_test.

    NOTE(review): this relies on the module-level `model` and `y_test`
    matching the X_test passed in — `model` is rebound several times in
    this notebook, so confirm which model is active before calling.
    """
    y_pred_reseau_neurone = model.predict(X_test)
    pred = y_pred_reseau_neurone.flatten()
    err = math.sqrt(sum((y_test-pred)**2)/len(y_test))
    return(err, y_pred_reseau_neurone)
# Fresh 80/20 split for the first neural network.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
def model1():
    """Build and compile the first feed-forward network (3 hidden blocks)."""
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=X_train.shape[1]))
    model.add(tf.keras.layers.Flatten())
    # Three hidden blocks: Dense(relu) each followed by 20% dropout.
    for units in (128 * 32, 128 * 8, 128 * 2):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(0.2))
    # Single linear output neuron for the price regression.
    model.add(tf.keras.layers.Dense(1))
    # MSE loss, Adam optimizer, MAE reported as metric.
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', metrics=['mae'])
    return model
# Build the model.
model = model1()
model.summary()
# Train for 30 epochs (batches of 100), validating on the held-out split.
hist = model.fit(x=X_train, y=y_train, validation_data=(X_test,y_test),batch_size=100, epochs=30)
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
flatten (Flatten) (None, 61) 0
dense (Dense) (None, 4096) 253952
dropout (Dropout) (None, 4096) 0
dense_1 (Dense) (None, 1024) 4195328
dropout_1 (Dropout) (None, 1024) 0
dense_2 (Dense) (None, 256) 262400
dropout_2 (Dropout) (None, 256) 0
dense_3 (Dense) (None, 1) 257
=================================================================
Total params: 4,711,937
Trainable params: 4,711,937
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
4898/4898 [==============================] - 429s 87ms/step - loss: 1196.8094 - mae: 23.3742 - val_loss: 472.1890 - val_mae: 14.6002
Epoch 2/30
4898/4898 [==============================] - 382s 78ms/step - loss: 571.5869 - mae: 17.3625 - val_loss: 504.1821 - val_mae: 15.6306
Epoch 3/30
4898/4898 [==============================] - 338s 69ms/step - loss: 518.4857 - mae: 16.5121 - val_loss: 702.0512 - val_mae: 20.7065
Epoch 4/30
4898/4898 [==============================] - 330s 67ms/step - loss: 489.0778 - mae: 16.0061 - val_loss: 291.9595 - val_mae: 12.1548
Epoch 5/30
4898/4898 [==============================] - 352s 72ms/step - loss: 466.0066 - mae: 15.5835 - val_loss: 486.9381 - val_mae: 15.6669
Epoch 6/30
4898/4898 [==============================] - 369s 75ms/step - loss: 454.5993 - mae: 15.3383 - val_loss: 433.7540 - val_mae: 16.2548
Epoch 7/30
4898/4898 [==============================] - 361s 74ms/step - loss: 437.4363 - mae: 15.0241 - val_loss: 285.7250 - val_mae: 11.9301
Epoch 8/30
4898/4898 [==============================] - 377s 77ms/step - loss: 425.5475 - mae: 14.7618 - val_loss: 244.9270 - val_mae: 10.7251
Epoch 9/30
4898/4898 [==============================] - 360s 74ms/step - loss: 419.7678 - mae: 14.6249 - val_loss: 266.2148 - val_mae: 11.2746
Epoch 10/30
4898/4898 [==============================] - 459s 94ms/step - loss: 413.1473 - mae: 14.4881 - val_loss: 223.8964 - val_mae: 10.1416
Epoch 11/30
4898/4898 [==============================] - 405s 83ms/step - loss: 402.7527 - mae: 14.2718 - val_loss: 268.9597 - val_mae: 11.6525
Epoch 12/30
4898/4898 [==============================] - 352s 72ms/step - loss: 391.6926 - mae: 14.0622 - val_loss: 203.2007 - val_mae: 9.8923
Epoch 13/30
4898/4898 [==============================] - 325s 66ms/step - loss: 382.4816 - mae: 13.8741 - val_loss: 232.5506 - val_mae: 10.4452
Epoch 14/30
4898/4898 [==============================] - 358s 73ms/step - loss: 374.1945 - mae: 13.7204 - val_loss: 189.0706 - val_mae: 9.4799
Epoch 15/30
4898/4898 [==============================] - 377s 77ms/step - loss: 363.5745 - mae: 13.5209 - val_loss: 205.3480 - val_mae: 9.7517
Epoch 16/30
4898/4898 [==============================] - 392s 80ms/step - loss: 362.2123 - mae: 13.4507 - val_loss: 299.9102 - val_mae: 12.2979
Epoch 17/30
4898/4898 [==============================] - 389s 79ms/step - loss: 353.4474 - mae: 13.2841 - val_loss: 192.8981 - val_mae: 9.7117
Epoch 18/30
4898/4898 [==============================] - 405s 83ms/step - loss: 345.8791 - mae: 13.1479 - val_loss: 186.7222 - val_mae: 9.4029
Epoch 19/30
4898/4898 [==============================] - 393s 80ms/step - loss: 338.5510 - mae: 12.9662 - val_loss: 259.4239 - val_mae: 11.8310
Epoch 20/30
4898/4898 [==============================] - 391s 80ms/step - loss: 332.0409 - mae: 12.8489 - val_loss: 187.1340 - val_mae: 9.3076
Epoch 21/30
4898/4898 [==============================] - 388s 79ms/step - loss: 331.3576 - mae: 12.7917 - val_loss: 287.0209 - val_mae: 11.7205
Epoch 22/30
4898/4898 [==============================] - 389s 79ms/step - loss: 321.8459 - mae: 12.5984 - val_loss: 205.7450 - val_mae: 9.7931
Epoch 23/30
4898/4898 [==============================] - 389s 79ms/step - loss: 321.2900 - mae: 12.5640 - val_loss: 178.7525 - val_mae: 9.1586
Epoch 24/30
4898/4898 [==============================] - 405s 83ms/step - loss: 312.9505 - mae: 12.4119 - val_loss: 188.5901 - val_mae: 9.8260
Epoch 25/30
4898/4898 [==============================] - 393s 80ms/step - loss: 309.5163 - mae: 12.3043 - val_loss: 276.9434 - val_mae: 10.8740
Epoch 26/30
4898/4898 [==============================] - 383s 78ms/step - loss: 306.9549 - mae: 12.2484 - val_loss: 170.7793 - val_mae: 8.8224
Epoch 27/30
4898/4898 [==============================] - 1166s 238ms/step - loss: 300.0146 - mae: 12.1012 - val_loss: 161.4802 - val_mae: 8.7700
Epoch 28/30
4898/4898 [==============================] - 354s 72ms/step - loss: 293.6774 - mae: 11.9524 - val_loss: 206.8320 - val_mae: 10.2950
Epoch 29/30
4898/4898 [==============================] - 341s 70ms/step - loss: 291.7540 - mae: 11.9139 - val_loss: 176.9885 - val_mae: 9.1720
Epoch 30/30
4898/4898 [==============================] - 365s 75ms/step - loss: 288.8921 - mae: 11.8524 - val_loss: 158.8825 - val_mae: 8.5567
# RMSE of the network on the held-out split.
err, y_pred_reseau_neurone = val_error_rate(X_test)
print(err)
3827/3827 [==============================] - 47s 12ms/step 12.604862727098713
# Learning curves: MAE and loss, training vs validation, per epoch.
mae, val_mae = hist.history['mae'], hist.history['val_mae']
loss, val_loss = hist.history['loss'], hist.history['val_loss']
epochs = range(1, len(mae) + 1)
# First figure: MAE curves.
plt.plot(epochs, mae, 'bo', label='Training mae')
plt.plot(epochs, val_mae, 'b', label='Validation mae')
plt.title('Training and validation mae')
plt.legend()
# Second figure: loss curves.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Compare the first 30 network predictions with the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
plt.plot(list(y_pred_reseau_neurone)[:30])
plt.plot(list(y_test)[:30])
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7ffdb3480be0>
# Predict on the submission set with the neural network.
y_pred_reseau_neurone = model.predict(X_pred)
prediction = y_pred_reseau_neurone.flatten()
df_rn = pd.DataFrame(df_pred['index'], columns=['index'])
df_rn['price'] = prediction
208/208 [==============================] - 2s 11ms/step
df_rn.to_csv("submission_neural_network1.csv", index=False)
# Fresh 80/20 split for the second neural network.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
def model2():
    """Build and compile the second feed-forward network (5 hidden layers)."""
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=X_train.shape[1]))
    model.add(tf.keras.layers.Flatten())
    # (units, add_dropout_after) for each hidden layer: dropout only after
    # the first and the last hidden layers, as in the original design.
    hidden = [(128 * 8, True), (128 * 4, False), (128 * 2, False),
              (128, False), (64, True)]
    for units, use_dropout in hidden:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        if use_dropout:
            model.add(tf.keras.layers.Dropout(0.2))
    # Single linear output neuron for the price regression.
    model.add(tf.keras.layers.Dense(1))
    # MSE loss, Adam optimizer, MAE reported as metric.
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', metrics=['mae'])
    return model
# Build the model.
model = model2()
model.summary()
# Train for 10 epochs (batches of 70), validating on the held-out split.
hist2 = model.fit(x=X_train, y=y_train, validation_data=(X_test,y_test),batch_size=70, epochs=10)
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
flatten (Flatten) (None, 61) 0
dense (Dense) (None, 1024) 63488
dropout (Dropout) (None, 1024) 0
dense_1 (Dense) (None, 512) 524800
dense_2 (Dense) (None, 256) 131328
dense_3 (Dense) (None, 128) 32896
dense_4 (Dense) (None, 64) 8256
dropout_1 (Dropout) (None, 64) 0
dense_5 (Dense) (None, 1) 65
=================================================================
Total params: 760,833
Trainable params: 760,833
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
6997/6997 [==============================] - 108s 15ms/step - loss: 1438.7579 - mae: 26.2698 - val_loss: 565.4164 - val_mae: 18.5739
Epoch 2/10
6997/6997 [==============================] - 95s 14ms/step - loss: 798.2602 - mae: 20.5401 - val_loss: 464.6253 - val_mae: 16.6080
Epoch 3/10
6997/6997 [==============================] - 93s 13ms/step - loss: 719.5382 - mae: 19.3596 - val_loss: 326.0145 - val_mae: 12.7412
Epoch 4/10
6997/6997 [==============================] - 95s 14ms/step - loss: 664.8763 - mae: 18.5588 - val_loss: 274.6079 - val_mae: 11.6092
Epoch 5/10
6997/6997 [==============================] - 91s 13ms/step - loss: 638.2036 - mae: 18.0692 - val_loss: 355.6843 - val_mae: 14.0500
Epoch 6/10
6997/6997 [==============================] - 92s 13ms/step - loss: 610.1665 - mae: 17.6406 - val_loss: 252.7878 - val_mae: 11.2041
Epoch 7/10
6997/6997 [==============================] - 92s 13ms/step - loss: 592.2375 - mae: 17.3236 - val_loss: 272.6738 - val_mae: 11.8791
Epoch 8/10
6997/6997 [==============================] - 93s 13ms/step - loss: 569.7868 - mae: 16.9804 - val_loss: 382.6258 - val_mae: 14.6137
Epoch 9/10
6997/6997 [==============================] - 93s 13ms/step - loss: 555.9241 - mae: 16.6998 - val_loss: 229.5312 - val_mae: 10.6297
Epoch 10/10
6997/6997 [==============================] - 90s 13ms/step - loss: 532.0649 - mae: 16.3191 - val_loss: 490.6958 - val_mae: 14.1106
# Validation error of the second network; val_error_rate also returns
# the predictions so they can be plotted below.
err, y_pred_reseau_neurone = val_error_rate(X_test)
print(err)
3827/3827 [==============================] - 13s 3ms/step 22.15165553630983
# Same diagnostics as for model 1, this time from the hist2 history.
history = hist2.history
mae, val_mae = history['mae'], history['val_mae']
loss, val_loss = history['loss'], history['val_loss']
epochs = range(1, len(mae) + 1)

# MAE curves: dots = training, solid line = validation.
plt.plot(epochs, mae, 'bo', label='Training mae')
plt.plot(epochs, val_mae, 'b', label='Validation mae')
plt.title('Training and validation mae')
plt.legend()

# Loss curves on their own figure.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

# Overlay the first 30 predictions against the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
plt.plot(list(y_pred_reseau_neurone)[:30])
plt.plot(list(y_test)[:30])
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7fefa578e100>
# Submission frame for the second network: index column first, then
# the flattened (n, 1) predictions as the price column.
df_rn = pd.DataFrame(df_pred['index'], columns=['index'])
y_pred_reseau_neurone = model.predict(X_pred)
prediction = y_pred_reseau_neurone.flatten()
df_rn['price'] = prediction
208/208 [==============================] - 1s 6ms/step
# Persist the second neural-network submission.
df_rn.to_csv("submission_neural_network2.csv", index=False)

# Fresh 80/20 split for the tree-based models that follow.
train_size = 0.8
(train_df, test_df, pred_df,
 X_train, y_train, X_test, y_test, X_pred) = prepare_data(df_train, df_pred, columnsTitles, train_size)
# Baseline: an unconstrained decision tree, seeded for reproducibility.
regressor1 = DecisionTreeRegressor(random_state=0)
regressor1.fit(X_train, y_train)
DecisionTreeRegressor(random_state=0)
# Evaluate the baseline tree on the held-out split.
y_pred_decision_tree = regressor1.predict(X_test)
# RMSE via numpy instead of a Python-level sum/sqrt: vectorized and
# numerically cleaner than math.sqrt(sum(...)/len(...)).
print(np.sqrt(np.mean((y_test - y_pred_decision_tree) ** 2)))
2.697622815148021
Le résultat donné par la plateforme Kaggle est d'environ 30.12. Nous avons donc un gros écart entre la RMSE que nous avons calculée et celle retournée par la plateforme.
Cet écart peut être dû à différents facteurs comme par exemple la qualité des données, avec des données de test et de prédiction différentes en termes de qualité, de complexité ou de bruit. En effet, les données de test peuvent être plus proches des données d'entraînement, ce qui facilite la généralisation des résultats. Un autre facteur peut être le nombre d'observations dans les jeux de données : étant donné que le jeu de données de prédiction contient un faible nombre d'observations, il est possible que cela ait entraîné une incertitude plus élevée dans les prédictions et une augmentation de l'erreur. Enfin, il est possible que l'écart soit dû à la précision du modèle, celui-ci étant sous-ajusté ou sur-ajusté.
# Visualise the fitted tree, then compare the first 30 predictions
# with the true prices.
plot_tree(regressor1)
plt.show()
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
preds_30 = list(y_pred_decision_tree)[:30]
truth_30 = list(y_test)[:30]
plt.plot(preds_30)
plt.plot(truth_30)
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7fb3ac05af10>
Afin d'obtenir un meilleur résultat et d'éviter les cas de sur-apprentissage et de sous-apprentissage, nous ajoutons des valeurs pour les paramètres min_samples_leaf et max_depth. Nous utilisons la méthode Grid Search pour tester plusieurs combinaisons de paramètres afin d'optimiser notre arbre de décision.
# Grid-search the tree depth and the minimum samples per leaf.
parameters = {'max_depth': range(2, 61), 'min_samples_leaf': range(2, 7)}
# random_state=0 makes the search reproducible and consistent with
# regressor1/regressor2, which are also seeded.
reg = GridSearchCV(DecisionTreeRegressor(random_state=0), parameters, n_jobs=-1)
reg.fit(X=X_train, y=y_train)
tree_model = reg.best_estimator_
print(reg.best_score_, reg.best_params_)
0.9984202237091335 {'max_depth': 58, 'min_samples_leaf': 2}
# Fresh split, then refit the tree with the best hyper-parameters
# found by the grid search (max_depth=58, min_samples_leaf=2).
train_size = 0.8
(train_df, test_df, pred_df,
 X_train, y_train, X_test, y_test, X_pred) = prepare_data(df_train, df_pred, columnsTitles, train_size)
regressor2 = DecisionTreeRegressor(random_state=0, max_depth=58, min_samples_leaf=2)
regressor2.fit(X_train, y_train)
DecisionTreeRegressor(max_depth=58, min_samples_leaf=2, random_state=0)
# Evaluate the tuned tree on the held-out split.
y_pred_decision_tree2 = regressor2.predict(X_test)
# RMSE via numpy instead of a Python-level sum/sqrt (vectorized).
print(np.sqrt(np.mean((y_test - y_pred_decision_tree2) ** 2)))
2.8863976579878243
# Predictions vs. true prices for the tuned tree, then its structure.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
preds_30 = list(y_pred_decision_tree2)[:30]
truth_30 = list(y_test)[:30]
plt.plot(preds_30)
plt.plot(truth_30)
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
plot_tree(regressor2)
plt.show()
# Submission from the unconstrained tree (regressor1).
y_pred_arbre_decision = regressor1.predict(X_pred)
prediction = y_pred_arbre_decision.flatten()
df_dt = pd.DataFrame(df_pred['index'], columns=['index'])
df_dt['price'] = prediction
df_dt.to_csv("submission_decision_tree1.csv", index=False)

# Submission from the tuned tree (regressor2).
y_pred_arbre_decision2 = regressor2.predict(X_pred)
prediction2 = y_pred_arbre_decision2.flatten()
df_dt2 = pd.DataFrame(df_pred['index'], columns=['index'])
df_dt2['price'] = prediction2
df_dt2.to_csv("submission_decision_tree2.csv", index=False)
# Fresh 80/20 split, then grid-search the random forest. The tree
# constraints are pinned to the best single-tree values (max_depth=58,
# min_samples_leaf=2 — both ranges contain one value), so only
# n_estimators really varies.
train_size = 0.8
train_df, test_df, pred_df, X_train, y_train, X_test, y_test, X_pred = prepare_data(df_train, df_pred, columnsTitles, train_size)
parameters = {'max_depth': range(58, 59), 'min_samples_leaf': range(2, 3), 'n_estimators': range(50, 101)}
# random_state=0 makes the search reproducible and consistent with
# regressor3 below, which is also seeded.
reg = GridSearchCV(RandomForestRegressor(random_state=0), parameters, n_jobs=-1)
reg.fit(X=X_train, y=y_train)
rf_model = reg.best_estimator_
print(reg.best_score_, reg.best_params_)
# Final forest: 100 trees, each constrained to the tuned tree limits,
# seeded for reproducibility.
regressor3 = RandomForestRegressor(
    n_estimators=100,
    max_depth=58,
    min_samples_leaf=2,
    random_state=0,
)
regressor3.fit(X_train, y_train)
RandomForestRegressor(max_depth=58, min_samples_leaf=2, random_state=0)
# Evaluate the random forest on the held-out split.
y_pred_random_forest = regressor3.predict(X_test)
# RMSE via numpy instead of a Python-level sum/sqrt (vectorized).
print(np.sqrt(np.mean((y_test - y_pred_random_forest) ** 2)))
2.678776382760411
# First 30 forest predictions against the true prices.
plt.style.use('fivethirtyeight')
plt.title('Performance du modèle')
preds_30 = list(y_pred_random_forest)[:30]
truth_30 = list(y_test)[:30]
plt.plot(preds_30)
plt.plot(truth_30)
plt.legend(['Prédictions', 'Prix réel'], loc='upper right')
<matplotlib.legend.Legend at 0x7ffd6e573d90>
importances = regressor3.feature_importances_
indices = np.argsort(importances)
liste_variables = list(X_train.columns)
# style du graphique
plt.style.use('fivethirtyeight')
%matplotlib inline
plt.figure(figsize=(20,10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [liste_variables[i] for i in indices])
plt.xlabel('Relative Importance')
Text(0.5, 0, 'Relative Importance')
# The same importances as a table, most important feature first.
(pd.DataFrame(regressor3.feature_importances_,
              index=X_train.columns,
              columns=["importance"])
   .sort_values("importance", ascending=False))
| importance | |
|---|---|
| pool | 0.219690 |
| brand_Safitel | 0.090906 |
| brand_Corlton | 0.086904 |
| brand_Chill Garden Inn | 0.080682 |
| stock | 0.075326 |
| ... | ... |
| language_french | 0.000032 |
| language_italian | 0.000028 |
| language_maltese | 0.000025 |
| language_croatian | 0.000024 |
| language_slovene | 0.000023 |
61 rows × 1 columns
# Build and export the random-forest submission file.
df_rf = pd.DataFrame(df_pred['index'], columns=['index'])
y_pred_foret_aleatoire = regressor3.predict(X_pred)
prediction = y_pred_foret_aleatoire.flatten()
df_rf['price'] = prediction
df_rf.to_csv("submission_random_forest.csv", index=False)